import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import doubletdetection
import warnings
warnings.filterwarnings('ignore')
import logging
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=130)
ko = sc.read_h5ad('./output/ko.preprocessing.h5ad')
ko.shape
#hvgs = adata_ko.var['highly_variable'].intersection(adata_wt.var['highly_variable'])
sc.pp.highly_variable_genes(ko, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(ko)
np.sum(ko.var['highly_variable'])
sc.pp.scale(ko, max_value=10)
sc.tl.pca(ko, svd_solver='arpack')
sc.pl.pca_variance_ratio(ko, log=True)
sc.pp.neighbors(ko)
sc.tl.umap(ko)
sc.tl.louvain(ko, resolution=1.2)
sc.pl.umap(ko, color=['louvain'])
mean_cellType = np.empty((len(ko.obs['louvain'].unique()), ko.raw.shape[1]),
dtype=float, order='C')
raw_adata = ko.raw.X.toarray()
for i in range(0, len(ko.obs['louvain'].unique())):
#print(adata.obs['phenograph'].unique()[i])
mean_cellType[i,:] = np.mean(raw_adata[ko.obs['louvain'] == ko.obs['louvain'].unique()[i], :], axis = 0)
mean_df = pd.DataFrame(np.corrcoef(mean_cellType), index = ko.obs['louvain'].unique(), columns = ko.obs['louvain'].unique())
import seaborn
ax = seaborn.clustermap(mean_df, cmap="jet").fig.suptitle('louvain')
sc.tl.rank_genes_groups(ko, 'louvain', method='wilcoxon')
sc.pl.rank_genes_groups(ko, n_genes=25, sharey=False)
pd.DataFrame(ko.uns['rank_genes_groups']['names']).head(50)
sc.pl.umap(ko, color=['Mki67', 'Top2a', 'Ube2c'], color_map="jet") # Cycling
sc.pl.umap(ko, color=['Lef1','Itm2a'], color_map="jet") # NKT0
sc.pl.umap(ko, color=['Plac8','Icos'], color_map="jet") # NKT2
sc.pl.umap(ko, color=['Tmem176a', 'Emb'], color_map="jet") # NKT17
sc.pl.umap(ko, color=['Klrb1c','Il2rb'], color_map="jet") # NKT1
sc.pl.umap(ko, color=['Ifit1','Ifit3','Isg15'], color_map="jet") # NKT1d
sc.pl.umap(ko, color=['louvain'], legend_loc = 'on data', legend_fontsize = 6)
marker_genes = ['Lef1','Itm2a','Mki67','Top2a','Zbtb16','Plac8','Izumo1r','Ifit1','Ifit3','Isg15','Ly6a','Fhl2','Nkg7','Fcer1g','Gzma','Gzmb','Rorc','Serpinb1a','Tmem176a','Tmem176b']
ax = sc.pl.dotplot(ko, marker_genes, groupby='louvain')
new_cluster_names = {
'0':'NKT2b', '1':'NKT1b', '2':'NKT1d', '3':'Cycling NKT', '4':'NKT1c',
'5':'NKT2b','6':'NKT17', '7':'NKT1b', '8':'Cycling NKT', '9':'NKT2a',
'10':'NKT0'}
vect = []
for i in range(0, len(ko.obs['louvain'])):
vect = vect + [new_cluster_names[str(ko.obs['louvain'][i])]]
ko.obs['cell_type'] = vect
sc.pl.umap(ko, color='cell_type', title='', frameon=False)
ko.obs['cell_type'].cat.categories
ko.obs['cell_type'].cat.reorder_categories(['NKT0','Cycling NKT','NKT17','NKT2a','NKT2b','NKT1b','NKT1c','NKT1d'], inplace = True)
ko.uns['cell_type_colors'] = [ '#A31E22', # NKT0
'#F3766E', # Cycling NKT
'#2da9d2', # NKT17
'#FEC85A', # NKT2a
'#FAE600', # NKT2b
#'#50C878', # NKT1a
'#2E8B57', # NKT1b
'#0B6623', # NKT1c
'#4CBB17'] # NKT1d
sc.pl.umap(ko, color='cell_type', title='', frameon=False, save='_ko.ann.pdf')
ko.shape
marker_genes = ['Lef1','Itm2a','Mki67','Top2a','Zbtb16','Plac8','Izumo1r','Ifit1','Ifit3','Isg15','Ly6a','Fhl2','Nkg7','Fcer1g','Gzma','Gzmb','Rorc','Serpinb1a','Tmem176a','Tmem176b']
ax = sc.pl.dotplot(ko, marker_genes, groupby='cell_type')
marker_genes = ['Blk', 'Rorc', 'Zbtb7b', 'Id2', 'Tbx21', 'Tox', 'Zbtb16', 'Gata3', 'E2f8', 'E2f2', 'Sox4','Lef1']
ax = sc.pl.dotplot(ko, marker_genes, groupby='cell_type')
sc.pl.matrixplot(ko, marker_genes, groupby='cell_type')
ko.write('./output/ko.ann.h5ad')
ko.obs['umap_1'] = pd.DataFrame(ko.obsm['X_umap']).iloc[:,0].tolist()
ko.obs['umap_2'] = pd.DataFrame(ko.obsm['X_umap']).iloc[:,1].tolist()
ko.obs.to_csv(path_or_buf = 'output/metadata.ko.tsv', sep = '\t', index = True)
sc.external.exporting.cellbrowser(ko,
'cellbrowser/ko',
'ko',
annot_keys=['cell_type', 'louvain', 'sample',
'n_counts', 'n_genes', 'percent_mito', 'percent_ribo'],
cluster_field='cell_type', nb_marker=30,
skip_matrix=False, do_debug=True)
adata_ann = sc.read_h5ad('./output/ko.ann.h5ad')
adata_raw = sc.read_h5ad('./output/ko.preprocessing.h5ad')
adata_raw.shape
tokeep = []
for x in adata_ann.obs['cell_type']:
tokeep = tokeep + [x in ['Cycling NKT']]
#keep_cells = adata_ann.obs['cell_type'] == ['Cycling NKT']
list_of_cell_names = adata_ann.obs_names[tokeep]
adata_cc = adata_raw[list_of_cell_names, ]
adata_cc.obs['cell_type'] = adata_ann.obs['cell_type']
adata_cc.shape
sc.pp.highly_variable_genes(adata_cc, min_mean=0.25, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata_cc)
np.sum(adata_cc.var['highly_variable'])
sc.pp.scale(adata_cc, max_value=10)
sc.tl.pca(adata_cc, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata_cc)# log = True
sc.pl.pca_loadings(adata_cc, components=list(range(0,7)))
sc.pp.neighbors(adata_cc)
sc.tl.umap(adata_cc)
# cell cycle scoring
cell_cycle_genes = [x.strip() for x in open('/data/10x_data/cell_cycle_vignette_files/regev_lab_cell_cycle_genes_mouse.txt')]
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
cell_cycle_genes = [x for x in cell_cycle_genes if x in adata_cc.var_names]
sc.tl.score_genes_cell_cycle(adata_cc, s_genes=s_genes, g2m_genes=g2m_genes)
sc.pl.umap(adata_cc, color=['cell_type','phase','G2M_score','S_score'], edges = False)
sc.pl.umap(adata_cc, color=['Plac8','Icos','Izumo1r','Itm2a','Lef1','Tmem176a','Emb','Klrb1c','Il2rb'], edges = False)